GSEA_Overlap_with_IREGenes.Rmd notebook, Fisher’s exact test was used to determine whether MSigDB gene sets were enriched in the zebrafish predicted IRE gene sets we defined.h_df <- lapply(h_l, cbind)
Error in lapply(h_l, cbind) : object 'h_l' not found
h_mapped <- readRDS(file.path(genesetsDir, "ens_h_mapped.rds"))
Error in file.path(genesetsDir, "ens_h_mapped.rds") :
object 'genesetsDir' not found
# Mouse entrez and ensembl ids:
mEns <- org.Mm.egENSEMBL %>%
as.data.frame %>%
set_colnames(c("m_entrezgene","m_ensembl"))
# Create a data.frame to map between human entrezgenes and zebrafish ensembl IDs.
# BioMart only includes homolog mappings for ensembl IDs which is why we need to
# retrieve human ensembl IDs, then join to the humanEntrezEns data.frame,
# in order to get the desired human entrezgenes to zebrafish ensembl ID mapping.
mMart <- useMart("ENSEMBL_MART_ENSEMBL", "mmusculus_gene_ensembl")
getFromBiomart <- c("ensembl_gene_id", "hsapiens_homolog_ensembl_gene")
mAndHumanEnsGenes <- getBM(getFromBiomart, values = unique(mEns$m_ensembl), mart = mMart) %>%
set_colnames(c("zebrafish_ensembl", "human_ensembl")) %>%
left_join(human_entrez2Ens, by = "human_ensembl") %>%
dplyr::select(-human_ensembl) %>%
dplyr::filter(complete.cases(.))
# mToHu <- getBM(c("ensembl_gene_id", "hsapiens_homolog_ensembl_gene"),
# values = unique(mEns$m_ensembl), mart = mMart) %>%
# set_colnames(c("m_ensembl","hu_ensembl"))
mapHumanGS2Mouse<- function(x) {
x %>%
as.data.frame %>%
set_colnames("human_entrezgene") %>%
left_join(mAndHumanEnsGenes, by = "human_entrezgene") %>%
dplyr::filter(complete.cases(.)) %>%
dplyr::select(-human_entrezgene) %>%
as.list %>%
unname %>%
.[[1]] %>%
unique
}
h_mapped_m <- lapply(h_df, mapHumanGS2Mouse)
c1_mapped_m <- lapply(c1_df, mapHumanGS2Mouse)
c2_mapped_m <- lapply(c2_df, mapHumanGS2Mouse)
c3_mapped_m <- lapply(c3_df, mapHumanGS2Mouse)
c4_mapped_m <- lapply(c4_df, mapHumanGS2Mouse)
c5_mapped_m <- lapply(c5_df, mapHumanGS2Mouse)
c6_mapped_m <- lapply(c6_df, mapHumanGS2Mouse)
c7_mapped_m <- lapply(c7_df, mapHumanGS2Mouse)
h_mapped_m %>% saveRDS(file.path(genesetsDir, "ens_h_mapped_m.rds"))
c1_mapped_m %>% saveRDS(file.path(genesetsDir, "ens_c1_mapped_m.rds"))
c2_mapped_m %>% saveRDS(file.path(genesetsDir, "ens_c2_mapped_m.rds"))
c3_mapped_m %>% saveRDS(file.path(genesetsDir, "ens_c3_mapped_m.rds"))
c4_mapped_m %>% saveRDS(file.path(genesetsDir, "ens_c4_mapped_m.rds"))
c5_mapped_m %>% saveRDS(file.path(genesetsDir, "ens_c5_mapped_m.rds"))
c6_mapped_m %>% saveRDS(file.path(genesetsDir, "ens_c6_mapped_m.rds"))
c7_mapped_m %>% saveRDS(file.path(genesetsDir, "ens_c7_mapped_m.rds"))
h_mapped_m <- readRDS(file.path(genesetsDir, "ens_h_mapped_m.rds"))
c1_mapped_m <- readRDS(file.path(genesetsDir, "ens_c1_mapped_m.rds"))
c2_mapped_m <- readRDS(file.path(genesetsDir, "ens_c2_mapped_m.rds"))
c3_mapped_m <- readRDS(file.path(genesetsDir, "ens_c3_mapped_m.rds"))
c4_mapped_m <- readRDS(file.path(genesetsDir, "ens_c4_mapped_m.rds"))
c5_mapped_m <- readRDS(file.path(genesetsDir, "ens_c5_mapped_m.rds"))
c6_mapped_m <- readRDS(file.path(genesetsDir, "ens_c6_mapped_m.rds"))
c7_mapped_m <- readRDS(file.path(genesetsDir, "ens_c7_mapped_m.rds"))
mouseIreGenes_h<-readRDS(here::here("R/IREGenes/data/mouseIreGenes_h.rds"))
humanIreGenes<-readRDS(here::here("R/IREGenes/data/human_ireGenes.rds"))
zebrafishIreGenes_h<-readRDS(here::here("R/IREGenes/data/zebrafishIreGenes_h.rds"))
h_tib <- h_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(h_mapped), source = "h")
c2_tib <- c2_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(c2_mapped), source = "c2")
c3_tib <- c3_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(c3_mapped), source = "c3")
c5_tib <- c5_mapped %>% tibble %>% set_colnames(c("ids")) %>% mutate(geneset = names(c5_mapped), source = "c5")
gs <- bind_rows(h_tib, c2_tib, c3_tib, c5_tib)
gs
humanIreGenes%>%str
List of 4
$ ire3_all: chr [1:3041] "ENSG00000187642" "ENSG00000162571" "ENSG00000224051" "ENSG00000107404" ...
$ ire5_all: chr [1:1265] "ENSG00000188157" "ENSG00000162572" "ENSG00000175756" "ENSG00000235169" ...
$ ire3_hq : chr [1:325] "ENSG00000162571" "ENSG00000248333" "ENSG00000215790" "ENSG00000169598" ...
$ ire5_hq : chr [1:106] "ENSG00000175756" "ENSG00000176083" "ENSG00000085831" "ENSG00000177301" ...
allIREGenes <- c(humanIreGenes$ire3_all, humanIreGenes$ire5_all)
allIREGenes %>% head
[1] "ENSG00000187642" "ENSG00000162571" "ENSG00000224051" "ENSG00000107404" "ENSG00000242485" "ENSG00000179403"
We wish to see the overlap between the genesets in gs with the humanIreGenes (including 3’ and 5’ IRE genes), as well as separately, with the 3’ IRE genes and 5’ IRE genes. We will add this information into the gs tibble.
gs %<>% rowwise() %>% mutate(
n = length(ids), # Number of genes in the geneset
n_with_ire = sum(ids %in% allIREGenes), # Number of genes in the gene set which have 3' or 5' predicted IREs
n_without_ire = (n - n_with_ire),
universe_with_ire = sum(human_entrez2Ens$human_ensembl %in% allIREGenes & !(human_entrez2Ens$human_ensembl %in% ids)),
universe_without_ire = (length(human_entrez2Ens$human_ensembl) - universe_with_ire),
n_with_ire3 = sum(ids %in% humanIreGenes$ire3_all), # Number of genes in the gene set which have 3' predicted IREs
n_without_ire3= (n - n_with_ire),
universe_with_ire3 = sum(human_entrez2Ens$human_ensembl %in% humanIreGenes$ire3_all & !(human_entrez2Ens$human_ensembl %in% ids)),
universe_without_ire3 = (length(human_entrez2Ens$human_ensembl) - universe_with_ire3),
n_with_ire5 = sum(ids %in% humanIreGenes$ire5_all), # Number of genes in the gene set which have 5' predicted IREs
n_without_ire5= (n - n_with_ire),
universe_with_ire5 = sum(human_entrez2Ens$human_ensembl %in% humanIreGenes$ire5_all & !(human_entrez2Ens$human_ensembl %in% ids)),
universe_without_ire5 = (length(human_entrez2Ens$human_ensembl) - universe_with_ire5)
) %>% ungroup()
gs
gs_mat <- gs %>%
dplyr::select(n_with_ire,
n_without_ire,
universe_with_ire,
universe_without_ire) %>%
apply(X = ., MARGIN = 1, FUN = function(x){
x %>%
matrix(2,2) %>%
t %>%
list()
}) %>% set_names(gs$geneset) %>%
lapply(function(x){
x %>% .[[1]]
})
gs_mat3 <- gs %>%
dplyr::select(n_with_ire3,
n_without_ire3,
universe_with_ire3,
universe_without_ire3) %>%
apply(X = ., MARGIN = 1, FUN = function(x){
x %>%
matrix(2,2) %>%
t %>%
list()
}) %>% set_names(gs$geneset) %>%
lapply(function(x){
x %>% .[[1]]
})
gs_mat5 <- gs %>%
dplyr::select(n_with_ire5,
n_without_ire5,
universe_with_ire5,
universe_without_ire5) %>%
apply(X = ., MARGIN = 1, FUN = function(x){
x %>%
matrix(2,2) %>%
t %>%
list()
}) %>% set_names(gs$geneset) %>%
lapply(function(x){
x %>% .[[1]]
})
gs_mat[1:3]
$HALLMARK_TNFA_SIGNALING_VIA_NFKB
[,1] [,2]
[1,] 45 183
[2,] 3999 26904
$HALLMARK_HYPOXIA
[,1] [,2]
[1,] 51 163
[2,] 3993 26910
$HALLMARK_CHOLESTEROL_HOMEOSTASIS
[,1] [,2]
[1,] 18 60
[2,] 4026 26877
gs_mat3[1:3]
$HALLMARK_TNFA_SIGNALING_VIA_NFKB
[,1] [,2]
[1,] 31 183
[2,] 3004 26904
$HALLMARK_HYPOXIA
[,1] [,2]
[1,] 34 163
[2,] 3001 26910
$HALLMARK_CHOLESTEROL_HOMEOSTASIS
[,1] [,2]
[1,] 14 60
[2,] 3021 26877
gs_mat5[1:3]
$HALLMARK_TNFA_SIGNALING_VIA_NFKB
[,1] [,2]
[1,] 17 183
[2,] 1256 26904
$HALLMARK_HYPOXIA
[,1] [,2]
[1,] 19 163
[2,] 1254 26910
$HALLMARK_CHOLESTEROL_HOMEOSTASIS
[,1] [,2]
[1,] 5 60
[2,] 1268 26877
fisher_res <- gs_mat %>% lapply(function(x){
x %>% fisher.test()
})
fisher_res3 <- gs_mat3 %>% lapply(function(x){
x %>% fisher.test()
})
fisher_res5 <- gs_mat5 %>% lapply(function(x){
x %>% fisher.test()
})
fisher_res_p <- fisher_res %>% lapply(function(x){x$p.value})
fisher_res_p3 <- fisher_res3 %>% lapply(function(x){x$p.value})
fisher_res_p5 <- fisher_res5 %>% lapply(function(x){x$p.value})
gs %<>%
mutate(fisher_p = fisher_res_p%>%unlist%>%unname,
fisher_p_3 = fisher_res_p3%>%unlist%>%unname,
fisher_p_5 = fisher_res_p5%>%unlist%>%unname) %>%
mutate(fdr = p.adjust(fisher_p, "fdr"),
fdr_3 = p.adjust(fisher_p_3, "fdr"),
fdr_5 = p.adjust(fisher_p_5, "fdr"))
For each gene set, we will calculate the number of genes expected to have IREs (based on the background proportion) and observed value. This information will be added to the gs object.
gs %<>% rowwise() %>% mutate(
exp_allIRE = (universe_with_ire / universe_without_ire)*n_without_ire,
obs_allIRE = n_with_ire,
obs_greater_than_exp_allIRE = obs_allIRE > exp_allIRE,
exp_ire3 = (universe_with_ire3 / universe_without_ire3)*n_without_ire3,
obs_ire3 = n_with_ire3,
obs_greater_than_exp_ire3 = obs_ire3 > exp_ire3,
exp_ire5 = (universe_with_ire5 / universe_without_ire5)*n_without_ire5,
obs_ire5 = n_with_ire5,
obs_greater_than_exp_ire5 = obs_ire5 > exp_ire5
)
gs %>%
dplyr::select(geneset, contains("exp"), starts_with("obs"), ) %>% ungroup()
The following table shows the top 50 gene sets enriched in IRE genesets, ranked by Fisher’s exact test p-values:
gs %>% ungroup() %>%
arrange(fisher_p) %>%
dplyr::select(geneset, contains("fdr"), n_with_ire, n, starts_with("obs_greater"))
gs %>% saveRDS(here::here("R/GSEA/data/ora/human_gs.rds"))
Sorted by genesets most enriched in 3’ IRE genes:
gs %>% ungroup %>% arrange(fisher_p_3) %>% dplyr::select(geneset, contains("fdr"), n_with_ire3, n, starts_with("obs_greater")) %>% head(50)
Sorted by genesets most enriched in 5’ IRE genes:
gs %>% ungroup %>% arrange(fisher_p_5) %>% dplyr::select(geneset, contains("fdr"), n_with_ire5, n, starts_with("obs_greater")) %>% head(50)
The following stacked bar chart shows the overlap between the top ~20 genesets and the predicted-IRE genesets.
The stacked bar chart doesn’t indicate to us whether the IRE genes in each gene set are actually shared or if they’re unique. Overlapping sets through Venn diagrams or UpSet plots would be appropriate for showing this, but I’m going to use a network plot for the following reasons:
To use Gephi to produce a network plot, we need a nodes table and an edges table. Here I will produce the nodes table using the top 15 gene sets ranked by p-value.
We will also append the predicted IRE gene sets and the “Hallmark Heme Metabolism” gene set as this one is considered like a “gold standard” and the first point for testing for many as it’s included in the Hallmark collection.
gsComb$common_ids%>%
unlist %>%
table %>%
as.data.frame%>%
arrange(desc(Freq)) %>%
set_colnames(c("ensembl_gene_id", "freq")) %>% head
ensembl_gene_id freq
1 ENSG00000185043 91
2 ENSG00000010810 78
3 ENSG00000082701 78
4 ENSG00000100030 78
5 ENSG00000102081 66
6 ENSG00000148498 66
The final nodes table for gephi will contain
head(nodesDf2, 30)
Id ire
1 BLALOCK_ALZHEIMERS_DISEASE_UP
2 AACTTT_UNKNOWN
3 PILON_KLF1_TARGETS_DN
4 GO_CELL_PROJECTION_PART
5 GO_CELL_PROJECTION_ORGANIZATION
6 GRAESSMANN_APOPTOSIS_BY_DOXORUBICIN_DN
7 GO_NEURON_PART
8 GO_NEUROGENESIS
9 GO_INTRACELLULAR_TRANSPORT
10 PEREZ_TP53_TARGETS
11 GO_REGULATION_OF_ORGANELLE_ORGANIZATION
12 GO_CELLULAR_COMPONENT_MORPHOGENESIS
13 GO_CELLULAR_MACROMOLECULE_LOCALIZATION
14 GO_RESPONSE_TO_ENDOGENOUS_STIMULUS
15 GO_NEURON_PROJECTION
16 Predicted 3' IRE genes 3
17 Predicted 5' IRE genes 5
18 HALLMARK_HEME_METABOLISM
19 ENSG00000205307 5
20 ENSG00000164692 no IRE
21 ENSG00000112561 3
22 ENSG00000177302 no IRE
23 ENSG00000284238 no IRE
24 ENSG00000076716 no IRE
25 ENSG00000137364 no IRE
26 ENSG00000164292 3
27 ENSG00000113658 no IRE
28 ENSG00000176170 5
29 ENSG00000156103 no IRE
30 ENSG00000104936 no IRE
Edges will be between:
edgeDf %>% head(20)
Source Target
1 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000205307
2 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000164692
3 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000112561
4 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000177302
5 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000284238
6 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000076716
7 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000137364
8 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000164292
9 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000113658
10 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000176170
11 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000156103
12 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000104936
13 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000197429
14 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000140836
15 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000132330
16 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000145819
17 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000230989
18 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000211455
19 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000006652
20 BLALOCK_ALZHEIMERS_DISEASE_UP ENSG00000142541
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Mojave 10.14.6
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
locale:
[1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
attached base packages:
[1] stats4 parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] pheatmap_1.0.12 biomaRt_2.38.0 org.Hs.eg.db_3.7.0 GSEABase_1.44.0 graph_1.60.0 annotate_1.60.1
[7] XML_3.98-1.20 gdtools_0.2.1 UpSetR_1.4.0 org.Dr.eg.db_3.7.0 fgsea_1.8.0 Rcpp_1.0.3
[13] reshape2_1.4.3 export_0.2.2 purrr_0.3.3 stringr_1.4.0 ggplot2_3.2.1 openxlsx_4.1.3
[19] here_0.1 readr_1.3.1 dplyr_0.8.3 tibble_2.1.3 magrittr_1.5 org.Mm.eg.db_3.7.0
[25] ensembldb_2.6.8 AnnotationFilter_1.6.0 GenomicFeatures_1.34.8 AnnotationDbi_1.44.0 GenomicRanges_1.34.0 GenomeInfoDb_1.18.2
[31] IRanges_2.16.0 S4Vectors_0.20.1 edgeR_3.24.3 limma_3.38.3 AnnotationHub_2.14.5 affy_1.60.0
[37] Biobase_2.42.0 BiocGenerics_0.28.0
loaded via a namespace (and not attached):
[1] colorspace_1.4-1 rprojroot_1.3-2 flextable_0.5.6 XVector_0.22.0
[5] base64enc_0.1-3 rstudioapi_0.10 affyio_1.52.0 bit64_0.9-7
[9] interactiveDisplayBase_1.20.0 fansi_0.4.0 xml2_1.2.2 knitr_1.26
[13] zeallot_0.1.0 jsonlite_1.6 Rsamtools_1.34.1 broom_0.5.2
[17] shiny_1.4.0 BiocManager_1.30.9 compiler_3.5.2 httr_1.4.1
[21] backports_1.1.5 assertthat_0.2.1 Matrix_1.2-17 fastmap_1.0.1
[25] lazyeval_0.2.2 cli_1.1.0 later_1.0.0 htmltools_0.4.0
[29] prettyunits_1.0.2 tools_3.5.2 gtable_0.3.0 glue_1.3.1
[33] GenomeInfoDbData_1.2.0 fastmatch_1.1-0 vctrs_0.2.0 Biostrings_2.50.2
[37] preprocessCore_1.44.0 nlme_3.1-142 stargazer_5.2.2 rtracklayer_1.42.2
[41] crosstalk_1.0.0 xfun_0.11 miniUI_0.1.1.1 mime_0.7
[45] lifecycle_0.1.0 zlibbioc_1.28.0 scales_1.0.0 hms_0.5.2
[49] promises_1.1.0 ProtGenerics_1.14.0 SummarizedExperiment_1.12.0 RColorBrewer_1.1-2
[53] yaml_2.2.0 curl_4.2 gridExtra_2.3 memoise_1.1.0
[57] stringi_1.4.3 RSQLite_2.1.2 manipulateWidget_0.10.0 zip_2.0.4
[61] BiocParallel_1.16.6 rlang_0.4.1 pkgconfig_2.0.3 systemfonts_0.1.1
[65] matrixStats_0.55.0 bitops_1.0-6 rgl_0.100.30 evaluate_0.14
[69] lattice_0.20-38 labeling_0.3 htmlwidgets_1.5.1 GenomicAlignments_1.18.1
[73] rvg_0.2.2 bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4
[77] R6_2.4.1 generics_0.0.2 DelayedArray_0.8.0 DBI_1.0.0
[81] pillar_1.4.2 withr_2.1.2 RCurl_1.95-4.12 crayon_1.3.4
[85] uuid_0.1-2 utf8_1.1.4 rmarkdown_1.17 officer_0.3.6
[89] progress_1.2.2 locfit_1.5-9.1 grid_3.5.2 data.table_1.12.6
[93] blob_1.2.0 webshot_0.5.1 digest_0.6.22 xtable_1.8-4
[97] tidyr_1.0.0 httpuv_1.5.2 munsell_0.5.0